Twitter Data Analysis

Imports and database connection


In [ ]:
from pymongo import MongoClient
from datetime import datetime, date, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
matplotlib.style.use('ggplot')

class TweetDatabase():
    """Wraps the MongoDB connection and selects the tweet collection per timetable."""
    def __init__(self, isNew):
        self.conn = MongoClient().data_science
        self.new = isNew

    def connect(self):
        # Tweets for each timetable are stored in a separate collection.
        if self.new:
            return self.conn.new_tweets
        else:
            return self.conn.old_tweets

class Config():
    """Holds the collection period for each timetable."""
    def __init__(self, isNew):
        self.new = isNew

    def period(self):
        # Inclusive (start, stop) dates of the tweet collection period.
        if self.new:
            return (date(2014, 12, 15), date(2015, 3, 22))
        else:
            return (date(2014, 10, 27), date(2014, 12, 14))
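
The isNew flag selects between the two timetables throughout the notebook. A minimal usage sketch of the two helpers (assuming a local MongoDB instance whose data_science database already contains both tweet collections):


In [ ]:
# Sanity-check sketch: report the collection period and number of stored tweets
# for both timetables, using the helpers defined above.
for is_new, label in [(False, 'old'), (True, 'new')]:
    conf = Config(is_new)
    db = TweetDatabase(is_new).connect()
    print label, "timetable:", conf.period(), "-", db.find().count(), "tweets"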

Old Timetable

Tweets per day


In [ ]:
conf = Config(False)
db = TweetDatabase(False).connect()
tweets = db.find()

start, stop = conf.period()
period = pd.date_range(start, stop)

# Tweets per day, one bucket for every day in the collection period
tpd = pd.Series(0, index=period)

for tweet in tweets:
    # 'created_at' is a timestamp string; parse it and drop the timezone
    create_datetime = parser.parse(tweet['created_at'], ignoretz=True)
    create_date = create_datetime.date().isoformat()
    tpd[create_date] += 1
    
tpd

In [ ]:
tpd.describe()

In [ ]:
fig = plt.figure()
plt.bar(tpd.index, tpd, color='b')
plt.title('Tweets per day (old timetable)')
plt.xlabel('Date')
plt.ylabel('Number of tweets')
plt.ylim(ymax=1800)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/old_tweets_per_day.png')

Tweets with geolocation


In [ ]:
db = TweetDatabase(False).connect()
geotweets = db.find({'coordinates': {'$ne': None}})

# Mercator map bounded by the corner coordinates below
m = Basemap(resolution='i', projection='merc',
    llcrnrlat=49.0, urcrnrlat=52.0, llcrnrlon=1., urcrnrlon=8.0, lat_ts=51.0)
m.drawcountries()
m.drawcoastlines()
m.fillcontinents()
for tweet in geotweets:
    # GeoJSON stores coordinates as [longitude, latitude]
    lon, lat = tweet['coordinates']['coordinates']
    x, y = m(lon, lat)  # project lon/lat to map coordinates
    m.plot(x, y, 'b.', alpha=0.5)
plt.title('Tweets with geolocation (old timetable)')
plt.savefig('./../../Paper/plots/old_tweets_geo.png')

print "Number of tweets with geolocation:", geotweets.count()

Tweets per hour

Only tweets posted on weekdays (Monday to Friday) are counted.
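
For reference, datetime's weekday() method numbers Monday as 0 and Sunday as 6, which is why the filter below keeps only values smaller than 5. A quick check using the date class imported above:


In [ ]:
# weekday(): Monday = 0 ... Sunday = 6, so < 5 means Monday through Friday
print date(2014, 10, 27).weekday()  # a Monday   -> 0
print date(2014, 11, 1).weekday()   # a Saturday -> 5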


In [ ]:
# One bucket per hour of the day (0-23)
hours = np.arange(24)
sr = pd.Series(0, index=hours)

db = TweetDatabase(False).connect()
tweets = db.find()

for tweet in tweets:
    create_datetime = parser.parse(tweet['created_at'], ignoretz=True)
    # weekday() is 0-4 for Monday to Friday
    if create_datetime.weekday() < 5:
        sr[create_datetime.hour] += 1

sr

In [ ]:
sr.describe()

In [ ]:
plt.figure()
# Centered three-hour moving average to smooth the hourly counts
rolling = pd.rolling_mean(sr, 3, center=True)
ax_hours = sr.plot(style='--', color='b')
rolling.plot(color='b', ax=ax_hours, legend=False)
plt.xticks(np.arange(0, 24, 2))
plt.title('Tweets per hour (old timetable)')
plt.xlabel('Hour')
plt.ylabel('Tweets')
plt.ylim(ymax=2500)
plt.savefig('./../../Paper/plots/old_tweets_per_hour.png')
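
The solid line above is the centered three-hour moving average of the hourly counts; pd.rolling_mean is the old (pre-0.18) pandas spelling, newer versions write sr.rolling(3, center=True).mean(). A tiny worked example of what it computes:


In [ ]:
demo = pd.Series([1, 4, 7, 10])
pd.rolling_mean(demo, 3, center=True)
# -> NaN, 4.0, 7.0, NaN: each value is the mean of itself and its two neighbours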

New Timetable

Tweets per day


In [ ]:
conf = Config(True)
db = TweetDatabase(True).connect()
tweets = db.find()

start, stop = conf.period()
period = pd.date_range(start, stop)

# Tweets per day, one bucket for every day in the collection period
tpd = pd.Series(0, index=period)

for tweet in tweets:
    # 'created_at' is a timestamp string; parse it and drop the timezone
    create_datetime = parser.parse(tweet['created_at'], ignoretz=True)
    create_date = create_datetime.date().isoformat()
    tpd[create_date] += 1
    
tpd

In [ ]:
tpd.describe()

In [ ]:
fig = plt.figure()
plt.bar(tpd.index, tpd, color='r')
plt.title('Tweets per day (new timetable)')
plt.xlabel('Date')
plt.ylabel('Number of tweets')
plt.ylim(ymax=1800)
fig.autofmt_xdate()
plt.savefig('./../../Paper/plots/new_tweets_per_day.png')

Tweets with geolocation


In [ ]:
db = TweetDatabase(True).connect()
geotweets = db.find({'coordinates': {'$ne': None}})

# Mercator map bounded by the corner coordinates below
m = Basemap(resolution='i', projection='merc',
    llcrnrlat=49.0, urcrnrlat=52.0, llcrnrlon=1., urcrnrlon=8.0, lat_ts=51.0)
m.drawcountries()
m.drawcoastlines()
m.fillcontinents()
for tweet in geotweets:
    # GeoJSON stores coordinates as [longitude, latitude]
    lon, lat = tweet['coordinates']['coordinates']
    x, y = m(lon, lat)  # project lon/lat to map coordinates
    m.plot(x, y, 'r.', alpha=0.5)
plt.title('Tweets with geolocation (new timetable)')
plt.savefig('./../../Paper/plots/new_tweets_geo.png')

print "Number of tweets with geolocation:", geotweets.count()

Tweets per hour

Only tweets posted on weekdays (Monday to Friday) are counted.


In [ ]:
# One bucket per hour of the day (0-23)
hours = np.arange(24)
sr = pd.Series(0, index=hours)

db = TweetDatabase(True).connect()
tweets = db.find()

for tweet in tweets:
    create_datetime = parser.parse(tweet['created_at'], ignoretz=True)
    # weekday() is 0-4 for Monday to Friday
    if create_datetime.weekday() < 5:
        sr[create_datetime.hour] += 1

sr

In [ ]:
sr.describe()

In [ ]:
plt.figure()
# Centered three-hour moving average to smooth the hourly counts
rolling = pd.rolling_mean(sr, 3, center=True)
ax_hours = sr.plot(style='--', color='r')
rolling.plot(color='r', ax=ax_hours, legend=False)
plt.xticks(np.arange(0, 24, 2))
plt.title('Tweets per hour (new timetable)')
plt.xlabel('Hour')
plt.ylabel('Tweets')
plt.ylim(ymax=2500)
plt.savefig('./../../Paper/plots/new_tweets_per_hour.png')